library(tidyverse)
library(reshape2)
library(forcats)
library(randomForest)
library(lubridate)
library(vegan)
library(broom)
library(biobroom)
# Point the session at the project root.
# NOTE(review): setwd() in a script is fragile (and is reset per chunk in R
# notebooks) — prefer the knitr root.dir option or project-relative paths.
setwd("~/Google Drive/RMB/Analyses/LifeCycle/")
# Notebook message: the working directory was changed inside a notebook chunk
# and is reset when the chunk finishes. Use the knitr root.dir option in the
# setup chunk to change the working directory for all notebook chunks.
# Load the long-format life-cycle OTU table (one row per sample x OTU) and the
# Greengenes taxonomy reference; OTU IDs live in the `variable` column.
lc_data <- readRDS("~/Google Drive/RMB/Analyses/LifeCycle/lc_long_data.rds")
tax <- readRDS("~/RMB/Reference/gg_otus_tax.rds")
# Optional: run the code below to regenerate the train/test split from scratch;
# otherwise the cached split is restored by the readRDS() calls further down.
# Build the training-sample roster: keep Arbuckle and Jonesboro, drop season
# 2015 and Arbuckle 2016, collapse to one row per sample, then randomly take
# half the samples within each Compartment x Age x Site x Season stratum and
# label them "Train". Only SampleID and type are used downstream.
training_samples <- lc_data %>%
  filter(Site == "Arbuckle" | Site == "Jonesboro") %>%
  filter(Season != "2015" & paste(Site, Season) != "Arbuckle 2016") %>%
  group_by(SampleID, Compartment, Age, Site, Season) %>%
  summarise(n = n()) %>%  # name the count; bare summarise(n()) yields a column literally called "n()"
  group_by(Compartment, Age, Site, Season) %>%
  sample_frac(0.5) %>%
  mutate(type = "Train")
# Subset lc_data to the training samples, drop OTUs with zero total counts in
# the training set, and add per-mille relative abundance (RA) and
# log2(RA + 1) columns (computed per OTU group; values are row-wise anyway).
train_data <- lc_data %>%
inner_join(training_samples %>% ungroup() %>% select(SampleID, type), by = "SampleID") %>%
group_by(variable) %>%
filter(sum(value) > 0) %>%
mutate(RA = (value / Depth) * 1000, log2value = log2(RA + 1))
# Held-out samples: everything not in the training roster, excluding the
# Arbuckle 2016 season (note the paste(Season, Site) order here mirrors the
# training filter above, just with the components swapped). Unlike the
# training set, no zero-count OTU filter is applied here — presumably so test
# samples keep the full OTU column set; confirm this is intended.
test_data <- lc_data %>% ungroup() %>%
filter(paste(Season, Site) != "2016 Arbuckle") %>%
anti_join(training_samples %>% ungroup() %>% select(SampleID, type), by = "SampleID") %>%
mutate(type = "Test") %>%
group_by(variable) %>%
mutate(RA = (value / Depth) * 1000, log2value = log2(RA + 1))
#lc_data <- NULL
# Restore the cached train/test split. This overwrites the objects computed
# above so the same random split is reused across runs.
train_data <- readRDS("~/Google Drive/RMB/Analyses/LifeCycle/Data/RF/train_data.rds")
test_data <- readRDS("~/Google Drive/RMB/Analyses/LifeCycle/Data/RF/test_data.rds")
## When using this function for each compartment and each site start from the third column
## If using for combining both sites, start at the fourth column
# 10-fold random-forest cross-validation (randomForest::rfcv) predicting Age
# from the OTU columns, assumed to start at column 4 (see the column-offset
# note above). Returns "<n-variables>_<cv-error>" strings, one per step of
# the log-spaced variable-reduction sequence.
rfcv_tidy <- function(x){
  cv <- rfcv(x[, 4:ncol(x)], x$Age, cv.fold = 10, log = TRUE)  # TRUE, not T (T is reassignable)
  paste(names(cv$error), cv$error, sep = "_")
}
# Fit a random forest predicting Age from columns 4+ and return its variable
# importance as a tibble: OTU IDs in `variable`, with the first importance
# measure (%IncMSE for regression) renamed to PercIncMSE.
get_importance <- function(x){
  rf <- randomForest(x[, 4:ncol(x)], x$Age, importance = TRUE)  # TRUE, not T
  imps <- as.data.frame(rf$importance)
  imps$variable <- row.names(imps)
  names(imps)[1] <- "PercIncMSE"
  as_tibble(imps)
}
# Fit a random forest predicting Age from columns 4+, keeping the forest for
# later predict() calls but skipping the (slow) importance computation.
tidy_randomforest <- function(x) {
  randomForest(x[, 4:ncol(x)], x$Age, importance = FALSE, keep.forest = TRUE)  # TRUE/FALSE, not T/F
}
# Predict from a fitted model using only the feature columns of `data`
# (column 4 through the last), matching the layout the model was trained on.
tidy_predict <- function(model, data) {
  feature_cols <- data[, 4:ncol(data)]
  predict(model, feature_cols)
}
# Cross-validate per Compartment x Site: pivot training samples to a wide
# sample x OTU matrix of log2 abundances, nest, and run rfcv on each group
# (bulk soil excluded).
# NOTE(review): spread() is superseded; pivot_wider() is the modern form.
cvs <- train_data %>%
select(type, Compartment, Site, Age, log2value, variable, SampleID) %>%
spread(variable, log2value, fill = 0) %>%
group_by(Compartment, Site, type) %>%
nest() %>%
filter(Compartment != "Bulk Soil" & type == "Train") %>%
mutate(cv = map(data, ~rfcv_tidy(.)))
# Plot CV error against the number of OTUs retained (log-scaled x axis),
# one line per site, faceted by compartment.
cvs %>%
  unnest(cv) %>%
  separate(cv, c("OTUs", "Error"), "_", convert = TRUE) %>%  # TRUE, not T
  ggplot(aes(OTUs, Error, color = Site)) +
  geom_line() +
  facet_grid(.~Compartment) +
  scale_x_log10()
# Report the OTU count that minimises CV error for each compartment x site.
cvs %>%
  unnest(cv) %>%
  separate(cv, c("OTUs", "Error"), "_", convert = TRUE) %>%  # TRUE, not T
  group_by(Compartment, Site) %>%
  filter(Error == min(Error))
# Per-group OTU importance: pivot the training data wide and fit one
# importance-enabled random forest per Site x Compartment (bulk soil and
# rhizoplane excluded).
# NOTE(review): the captured console error following this chunk shows it once
# ran before get_importance() was defined — source the helper functions first.
imps <- train_data %>%
filter(Compartment != "Bulk Soil" & type == "Train" & Compartment != "Rhizoplane") %>%
select(SampleID, Age, Compartment, Site, variable, log2value) %>%
spread(variable, log2value, fill = 0) %>%
group_by(Site, Compartment) %>%
nest() %>%
mutate(imp = map(data, ~get_importance(.)))
# Console output from a previous run:
#   Error: could not find function "get_importance"
# Refit random forests restricted to the top OTUs per Compartment x Site.
# NOTE(review): `top_otus` is not created in the code shown here — presumably
# derived from the importance results; confirm it exists before running.
rf_sparse <- train_data %>%
inner_join(top_otus, by = c("Compartment", "Site", "variable")) %>%
select(SampleID, Age, Compartment, type, variable, log2value, Site) %>%
group_by(Compartment, type, Site) %>%
nest(., .key = "train_data") %>%
mutate(train_spread_data = map(train_data, ~spread(., variable, log2value, fill = 0))) %>%
mutate(rf = map(train_spread_data, ~tidy_randomforest(.)))
# Console message: joining factor and character vector, coercing into
# character vector.
# Sanity check: predicted vs. actual Age on the training data itself,
# faceted by compartment and site.
rf_sparse %>%
mutate(predictions = map2(rf, train_spread_data, predict)) %>%
unnest(train_spread_data, predictions) %>%
select(Compartment, Site, predictions, Age) %>%
ggplot(aes(Age, predictions)) +
geom_point() +
facet_grid(Compartment ~ Site)
# Cache the sparse random-forest models.
# NOTE(review): newer readr renamed write_rds()'s `path` argument to `file`;
# `path` still works but is deprecated.
write_rds(rf_sparse, path = "~/Google Drive/RMB/Analyses/LifeCycle/Data/RF/rf_sparse.rds")
# predict() wrapped so that failures (e.g. newdata missing OTU columns)
# return NA instead of aborting the pipeline.
safe_predict <- possibly(predict, NA_real_)
# Apply the Arbuckle-trained models to all samples (train + test, both
# sites): keep only Arbuckle top OTUs, pivot each sample to a wide OTU
# vector, pair it with the matching compartment model, and predict Age.
# NOTE(review): this chunk is duplicated below for Jonesboro with only the
# site strings changed — a candidate for a small parameterised helper.
arb_model_test_predictions <- bind_rows(train_data, test_data) %>%
inner_join(top_otus %>% ungroup() %>% filter(Site == "Arbuckle") %>% select(Compartment, variable), by = c("variable", "Compartment")) %>%
select(SampleID, Age, variable, log2value, Site, Compartment, Season, type, Date, Cultivar) %>%
group_by(Compartment, Season, type, Site, SampleID, Age, Date, Cultivar) %>%
mutate(log2value = ifelse(is.na(log2value), 0, log2value)) %>%
nest() %>%
mutate(spread_data = map(data, ~(spread(., variable, log2value, fill = 0)))) %>%
select(-data) %>%
inner_join(rf_sparse %>% filter(Site == "Arbuckle") %>% ungroup() %>% select(Compartment, rf), by = "Compartment") %>%
mutate(predictions = map2(rf, spread_data, safe_predict)) %>%
mutate(model = paste("Arbuckle", Compartment))
# Console message: joining factor and character vector, coercing into
# character vector.
# Jonesboro analogue of the Arbuckle prediction chunk above: apply the
# Jonesboro-trained models to all samples from both sites.
ark_model_test_predictions <- bind_rows(train_data, test_data) %>%
inner_join(top_otus %>% ungroup() %>% filter(Site == "Jonesboro") %>% select(Compartment, variable), by = c("variable", "Compartment")) %>%
select(SampleID, Age, variable, log2value, Site, Compartment, Season, type, Date, Cultivar) %>%
group_by(Compartment, Season, type, Site, SampleID, Age, Date, Cultivar) %>%
mutate(log2value = ifelse(is.na(log2value), 0, log2value)) %>%
nest() %>%
mutate(spread_data = map(data, ~(spread(., variable, log2value, fill = 0)))) %>%
select(-data) %>%
inner_join(rf_sparse %>% filter(Site == "Jonesboro") %>% ungroup() %>% select(Compartment, rf), by = "Compartment") %>%
mutate(predictions = map2(rf, spread_data, safe_predict)) %>%
mutate(model = paste("Jonesboro", Compartment))
# Console message: joining factor and character vector, coercing into
# character vector.
# Extract variable importance from the combined-sites models.
# NOTE(review): `rf_sparse_combo` is not created in the code shown here, and
# the captured console error after this chunk suggests the importance helper
# was unavailable when it ran — verify both exist before executing.
top_otus_combo <- rf_sparse_combo %>%
mutate(importance = map(rf, ~get_importance(.))) %>%
unnest(importance)
# Console output from a previous run:
#   Error: could not find function "importance"
# lm() wrapped so that model-fitting failures return NA instead of aborting
# the per-OTU map() below.
safe_lm <- possibly(lm, NA_real_)
# Per-OTU site-effect models: restrict to Endosphere/Rhizosphere samples at
# ages shared between sites (via `same_ages`, defined elsewhere), drop OTUs
# detected in <10% of samples per compartment, then fit log2value ~ Site for
# each Compartment x OTU x Age and tidy the coefficients.
site_lm <- bind_rows(train_data, test_data) %>%
  filter(Compartment == "Endosphere" | Compartment == "Rhizosphere") %>%  # BUG FIX: the pipe was missing here, orphaning the rest of the chain
  inner_join(same_ages, by = "Age") %>%
  ungroup() %>% select(Age, SampleID, Compartment, Site, log2value, Season, variable) %>%
  filter(Site == "Arbuckle" | Site == "Jonesboro") %>%
  filter(Season != 2015) %>%
  group_by(variable, Compartment) %>%
  filter(sum(log2value > 0) / n() > 0.1) %>%  # keep OTUs present in >10% of samples
  group_by(Compartment, variable, Age) %>%
  nest() %>%
  mutate(model = map(data, ~tidy(safe_lm(log2value ~ Site, .)))) %>%
  unnest(model)
# Cache the per-OTU site-effect model results.
saveRDS(site_lm, "~/Google Drive/RMB/Analyses/LifeCycle/Data/site_lm.rds")
# Classify OTUs as site-associated: keep Jonesboro-coefficient rows with
# defined p-values, Bonferroni-adjust within compartment (bulk soil dropped),
# and call an OTU "Arbuckle" (negative estimate) or "Jonesboro" (positive)
# when p.adj <= 0.05.
lm_classification <- site_lm %>%
  filter(p.value != "NaN" & term == "SiteJonesboro") %>%  # NOTE(review): numeric-vs-string comparison; !is.nan(p.value) would be cleaner but treats NA differently
  group_by(Compartment) %>%
  filter(Compartment != "Bulk Soil") %>%
  mutate(p.adj = p.adjust(p.value, "bonferroni")) %>%  # spell the method out; "bon" relied on partial matching
  filter(p.adj <= 0.05) %>%
  mutate(Site = ifelse(estimate < 0, "Arbuckle", "Jonesboro"))
# Count site-classified OTUs per compartment, site, and age, and plot as
# dodged bars (rhizosphere panel ordered first).
lm_classification %>%
group_by(Compartment, Site, Age) %>%
summarise(n = n()) %>% ungroup() %>%
mutate(Compartment = fct_relevel(Compartment, "Rhizosphere", "Endosphere")) %>%
ggplot(aes(Age, n, fill = Site)) +
geom_bar(stat = "identity", position = "dodge") +
facet_grid(.~Compartment) +
scale_fill_manual(values = c("#f57670ff", "steelblue")) +
theme_minimal()
# Number of distinct site-classified OTUs per compartment: first collapse to
# one row per Compartment x OTU, then count OTUs per compartment.
lm_classification %>%
group_by(Compartment, variable) %>%
summarise(n = n()) %>%
group_by(Compartment) %>%
summarise(n = n())
# Per-sample aggregate relative abundance of the site-classified OTUs,
# matched on OTU, compartment, site, and age.
lm_classification_abund <- bind_rows(train_data, test_data) %>%
inner_join(lm_classification, by = c("variable", "Compartment", "Site", "Age")) %>%
group_by(Compartment, Site, Age, SampleID) %>%
summarise(total = sum(RA))
# Mean (+/- SE) aggregate abundance of site-classified OTUs across ages;
# dividing the per-mille RA totals by 10 converts them to percent.
lm_classification_abund %>%
group_by(Compartment, Site, Age) %>%
summarise(mean_ab = mean(total / 10), se = sd(total / 10) / sqrt(n())) %>% ungroup() %>%
mutate(Compartment = fct_relevel(Compartment, "Rhizosphere", "Endosphere")) %>%
ggplot(aes(Age, mean_ab, fill = Site)) +
geom_bar(stat = "identity", position = "dodge") +
geom_errorbar(aes(ymin = mean_ab - se, ymax = mean_ab + se), position = position_dodge(width = 12.5), width = 0.1, size = 0.2) +
facet_grid(.~Compartment) +
scale_fill_manual(values = c("#f57670ff", "steelblue")) +
theme_minimal()
# Test whether aggregate site-classified abundance changes with age: fit
# total ~ Age per Compartment x Site, keep the slope terms, and
# Bonferroni-adjust the p-values.
lm_classification_abund %>%
  group_by(Compartment, Site) %>%
  nest() %>%
  mutate(model = map(data, ~tidy(lm(total ~ Age, .)))) %>%
  unnest(model) %>%
  filter(term != "(Intercept)") %>%
  mutate(p.adj = p.adjust(p.value, "bonferroni"))  # spell the method out; "bon" relied on partial matching
# Distribution of per-OTU persistence: count at how many ages each
# site-classified OTU appears, then count OTUs per age-count, and plot.
lm_classification %>%
group_by(Compartment, Site, variable) %>%
summarise(count = n()) %>%
group_by(count, Site, Compartment) %>%
summarise(count2 = n()) %>%
ggplot(aes(count, count2)) +
geom_bar(stat = "identity") +
facet_grid(Compartment ~ Site)
# Question: are early-colonizing OTUs differentially abundant relative to
# the soil?
# Order rhizosphere OTUs by the age at which their fitted abundance peaks,
# min-max scale the observed and fitted values per OTU, attach taxonomy and
# direction classifications, and write the resulting plot order to a TSV.
# NOTE(review): depends on peak_per_age_rs, spline_predictions_rs, and
# otu_directions_combo, none of which are created in the code shown here;
# nest(-var) and bare unnest() are deprecated in current tidyr.
peak_per_age_rs %>%
select(variable, peak = Age) %>%
inner_join(spline_predictions_rs, by = "variable") %>%
inner_join(tax, by = "variable") %>%
mutate(group_var = paste(variable, Family)) %>%
group_by(variable) %>%
mutate(scaled_percent = (percent - min(percent)) / (max(percent) - min(percent)),
scaled_fit = (.fitted - min(.fitted)) / (max(.fitted) - min(.fitted))) %>%
ungroup() %>%
mutate(variable = reorder(variable, peak)) %>%
arrange(variable) %>%
mutate(variable.f = reorder(as.character(variable), desc(variable))) %>%
group_by(variable.f) %>%
nest(-variable.f) %>%
mutate(order = (0:(nrow(.)-1))*0.5) %>%
mutate(order2 = 1:nrow(.)) %>%
unnest() %>%
inner_join(otu_directions_combo %>% ungroup() %>% select(Compartment, classification, variable), by = c("variable", "Compartment")) %>%
group_by(variable, Compartment, order2, Kingdom, Phylum, Class, Order, Genus, Species, classification) %>%
summarise(n = n()) %>% write_tsv("~/Google Drive/RMB/Analyses/LifeCycle/TABLES/ad_rs_plot_order.tsv")
# Console message: column `variable` joining factor and character vector,
# coercing into character vector.
# Drought experiment: ANOVA of predicted ages against Treatment x Cultivar
# within each Compartment x Soil, with Tukey HSD post-hoc tests, keeping
# only significant contrasts.
# NOTE(review): model_test_predictions_drought is created elsewhere — confirm
# it exists before running.
model_test_predictions_drought %>%
unnest(predictions) %>%
group_by(Compartment, Soil) %>%
nest() %>%
mutate(an = map(data, ~aov(predictions ~ Treatment * Cultivar, .))) %>%
mutate(thst = map(an, ~tidy(TukeyHSD(.)))) %>%
unnest(thst) %>%
filter(adj.p.value <= 0.05)